Members

Patrick Little, Manjola Chiappetta

Summary

This section is a summary of the project.

instructions: Write an abstract (a kind of a summary) to describe your project. The abstract must be within 175 to 250 words (inclusive). To write the abstract, first state the problem you are addressing. For example, if your project is on Churn analysis, then give a brief explanation of it. Second, write the summary of your classification results (e.g., accuracy). Third, state key points about the post-predictive analysis and fourth, summarize your recommendations to the organization.

Workload Distribution

Member Name — List of Tasks Performed
Patrick Little - some tasks
Manjola Chiappetta - some tasks

Exploratory Data Analysis

In this section we will: - Look at the attribute types in the dataset - Find any missing values - Find the max, min, mean and standard deviation of the attributes - Determine any outlier values for the attributes under consideration - Analyze the distribution of numeric attributes

# Load the UCI Bank Marketing dataset (4,521 rows x 17 columns) from GitHub.
bank<-read.csv("https://raw.githubusercontent.com/PatLittle/CIND119-group-project/main/bank_marketing/bank.csv")

# DataExplorer::introduce(): dimensions, column types, and missing-value counts.
introduce(bank)
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1 4521      17               10                  7                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                    0          4521              76857       495152
# Visual version of the introduce() summary.
plot_intro(bank)

# Per-column missing-value profile (no NA values in this dataset -- see below).
plot_missing(bank)

#plot_bar(bank, by = "y")
# Histograms of the continuous attributes to inspect their distributions.
plot_histogram(bank)

# Correlation heatmap; type = "d" restricts it to the discrete columns.
plot_correlation(na.omit(bank), type = "d")

# PCA of the attributes, capped at 90% cumulative explained variance.
plot_prcomp(bank, variance_cap = 0.9, ncol =1L, nrow=1L)

# Column types: 7 integer and 10 character attributes; `y` is the target.
str(bank)
## 'data.frame':    4521 obs. of  17 variables:
##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "oct" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...
# NA count per column: all zero. Note that several character columns encode
# missingness as the string "unknown" (e.g. contact, poutcome) rather than NA.
colSums(is.na(bank))
##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0

Predictive Modeling / Classification

Decision Tree

### Decision Tree



# Convert every character column to a factor so the classification engines
# treat them as categorical. across()/where() replaces the superseded
# mutate_if() scoped verb (dplyr >= 1.0).
bank_clean <- bank %>% mutate(across(where(is.character), factor))

# Reproducible 75/25 train/test split, stratified on the target `y` so the
# class balance is preserved in both partitions.
set.seed(888)
bank_split <- initial_split(bank_clean, prop = 0.75,
                            strata = y)

bank_training <- bank_split %>% training()
bank_test <- bank_split %>% testing()
# 10-fold cross-validation resamples of the training set for tuning.
bank_folds <- vfold_cv(bank_training, v = 10)



# Model formula: classify `y` from all remaining attributes. No preprocessing
# steps are added to the recipe.
bank_recipe <- recipe(y ~ ., data = bank_training)

# Sanity check: estimate and apply the (empty) recipe on the training data.
bank_clean_baked <- bake(prep(bank_recipe), new_data = bank_training)

# CART decision tree with all three complexity controls left for tuning.
tree_model <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_mode("classification") %>%
  set_engine("rpart")

# Bundle the tunable model spec and the recipe into a single workflow.
tree_workflow <- workflow() %>% 
  add_model(tree_model) %>% 
  add_recipe(bank_recipe)

# Space-filling Latin hypercube sample of 60 candidate hyperparameter sets.
tree_grid <- grid_latin_hypercube(cost_complexity(),
                          tree_depth(),
                          min_n(), 
                          size = 60)

set.seed(888)

# Evaluate every candidate on the 10 CV folds.
tree_tuning <- tree_workflow %>% 
  tune_grid(resamples = bank_folds,
            grid = tree_grid)
## Warning: package 'vctrs' was built under R version 4.0.5
# Top 5 candidates by cross-validated ROC AUC. `metric` is passed by name:
# positional use is deprecated in tune and this matches the
# select_best(metric = ...) call used later in the script.
tree_tuning %>% show_best(metric = 'roc_auc')
## # A tibble: 5 x 9
##   cost_complexity tree_depth min_n .metric .estimator  mean     n std_err
##             <dbl>      <int> <int> <chr>   <chr>      <dbl> <int>   <dbl>
## 1  0.000000000688         10    20 roc_auc binary     0.863    10 0.00653
## 2  0.00000448              9    28 roc_auc binary     0.862    10 0.00615
## 3  0.00000322              9    27 roc_auc binary     0.862    10 0.00623
## 4  0.00000000641          11    24 roc_auc binary     0.860    10 0.00646
## 5  0.00000874             11    23 roc_auc binary     0.860    10 0.00644
## # ... with 1 more variable: .config <chr>
# Best hyperparameter combination by cross-validated ROC AUC.
best_tree <- tree_tuning %>% 
  select_best(metric = 'roc_auc')

# Lock the winning hyperparameters into the workflow.
final_tree_workflow <- tree_workflow %>% 
  finalize_workflow(best_tree)

# Refit the finalized workflow on the full training set.
tree_wf_fit <- final_tree_workflow %>% 
  fit(data = bank_training)

# Pull out the underlying parsnip/rpart fit for inspection.
# extract_fit_parsnip() supersedes the deprecated pull_workflow_fit()
# (requires workflows >= 0.2.3).
tree_fit <- tree_wf_fit %>% 
  extract_fit_parsnip()

# Variable importance of the fitted tree.
vip(tree_fit)

# Plot the tree; roundint = FALSE avoids rpart.plot re-reading the data
# to decide whether splits should be displayed as integers.
rpart.plot(tree_fit$fit, roundint = FALSE)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Fit on the training partition and evaluate once on the held-out test split.
tree_last_fit <- final_tree_workflow %>% 
  last_fit(bank_split)

# Test-set performance of the final tree: accuracy and ROC AUC.
tree_last_fit %>% collect_metrics()
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.892 Preprocessor1_Model1
## 2 roc_auc  binary         0.827 Preprocessor1_Model1
# ROC curve on the test set; "no" is the first factor level, so .pred_no is
# used as the event probability.
tree_last_fit %>% collect_predictions() %>% 
  roc_curve(truth  = y, estimate = .pred_no) %>% 
  autoplot()

tree_predictions <- tree_last_fit %>% collect_predictions()

# Confusion matrix on the held-out test set.
conf_mat(tree_predictions, truth = y, estimate = .pred_class)
##           Truth
## Prediction  no yes
##        no  949  71
##        yes  51  59
# Spot-check: score a single held-out row with the fitted workflow.
predict(tree_last_fit$.workflow[[1]], bank_test[15,])
## # A tibble: 1 x 1
##   .pred_class
##   <fct>      
## 1 no
# Persist the fitted workflow. The same path string is used for saving and
# loading (the original saved to "./saved_model.Rds" but read back
# "saved_model.Rds"; both resolve to the same file, unified here).
saveRDS(tree_last_fit$.workflow[[1]], "saved_model.Rds")

trained_model <- readRDS("saved_model.Rds")

Naive Bayes

# Naive Bayes uses the same seed, proportion, stratification, and fold count
# as the decision tree, so both models see identical train/test partitions.
set.seed(888)
nb_split <- initial_split(bank_clean, prop = 0.75, 
                            strata = y)

nb_training <- nb_split %>% training()
nb_test <- nb_split %>% testing()
nb_folds <- vfold_cv(nb_training, v = 10)

# Recipe with no preprocessing steps: `y` predicted from all attributes.
nb_recipe <- recipe(y ~ ., data = nb_training)

library(discrim)
## 
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
## 
##     smoothness
# Naive Bayes specification via the discrim/naivebayes engine.
nb_spec <- naive_Bayes() %>%
  set_mode("classification") %>%
  set_engine("naivebayes")

nb_spec
## Naive Bayes Model Specification (classification)
## 
## Computational engine: naivebayes
# Build the workflow once and reuse it: the original assembled the same
# recipe + model pair twice (inline for nb_fit and again as nb_wf_final).
nb_wf <- workflow() %>%
  add_recipe(nb_recipe)

nb_wf_final <- nb_wf %>%
  add_model(nb_spec)

# Fit on the training partition.
nb_fit <- nb_wf_final %>%
  fit(data = nb_training)

# 10-fold CV estimate; predictions are saved for resample diagnostics.
nb_rs <- fit_resamples(
  nb_wf_final,
  nb_folds,
  control = control_resamples(save_pred = TRUE)
)

# Fit on training data and evaluate once on the held-out test split.
nb_last_fit <- nb_wf_final %>% 
  last_fit(nb_split)

# Test-set performance of Naive Bayes: accuracy and ROC AUC.
nb_last_fit %>% collect_metrics()
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.881 Preprocessor1_Model1
## 2 roc_auc  binary         0.849 Preprocessor1_Model1
# ROC curve on the test set; "no" is the first factor level, so .pred_no is
# used as the event probability.
nb_last_fit %>% collect_predictions() %>% 
  roc_curve(truth  = y, estimate = .pred_no) %>% 
  autoplot()

nb_predictions <- nb_last_fit %>% collect_predictions()
# Confusion matrix on the held-out test set.
conf_mat(nb_predictions, truth = y, estimate = .pred_class)
##           Truth
## Prediction  no yes
##        no  956  91
##        yes  44  39

Conclusions and Recommendations

Some text wrapping up the report